Data set from Kaggle.
# Load the email data set: a "text" column plus a 0/1 spam indicator
email_df <- data.table::fread( "spam_or_not_spam.csv", header = TRUE )
# Shuffle the rows so the positional train/test split later is random
set.seed(46234)
email_df <- email_df[ sample( seq_len( nrow( email_df ) ) ), ]
# Encode the spam indicator as a factor for the classifiers below
email_df$label <- factor( email_df$label )
library(pacman)
# Load (and install if missing) all packages used in this analysis.
# Note: the original listed `ggthemes` twice; the duplicate is removed.
p_load(
  # Basics
  here, skimr, dplyr, stringr, fastverse, disk.frame,
  # Visualizing
  ggplot2, ggthemes, wordcloud, RColorBrewer,
  # Text Processing
  tm, SnowballC,
  # Modelling
  e1071, naivebayes, tidymodels, gridExtra, caret, ranger,
  # Knitting
  knitr, kableExtra, DT, shiny, equatiomatic
)
To retrieve the root of a word (e.g., doing -> do), the options are "stemming" & "lemmatization".
STEMMING: faster but maybe not as effective
LEMMATIZATION: slower but more effective
More on this here
VectorSource(): creates one document for each email
VCorpus(): creates a volatile corpus from these individual emails
# Build a volatile (in-memory) corpus: one document per email
email_corpus <- VCorpus(
  VectorSource(
    email_df$email
  )
)
# Stem each word with the `tm` package (e.g., "doing" -> "do")
email_corpus <- tm::tm_map( email_corpus,
                            tm::stemDocument )
# Remove punctuation
email_corpus <- tm_map( email_corpus,
                        removePunctuation )
# Remove English stopwords
email_corpus <- tm_map( email_corpus,
                        removeWords, stopwords( "en" ) )
# Remove the two most frequent placeholder tokens: "NUMBER", "URL"
email_corpus <- tm_map( email_corpus,
                        removeWords, c("NUMBER", "number", "url", "URL") )
# DocumentTermMatrix(): tokenize the email corpus.
# FIX: the original wrapped the corpus in sample(), re-shuffling the documents
# and breaking the row alignment between the DTM and email_df$label -- the
# train/test split below indexes both by position.  The corpus is already in
# the randomized email_df order, so build the DTM directly.
email_dtm <- tm::DocumentTermMatrix( email_corpus )
Using wordclouds, visualize text data after cleaning and pre-processing
# Convert the cleaned corpus back into a plain data frame (raw text plus
# the spam label) so the wordcloud calls below can use it directly
reverse_email <- data.frame(
  text             = sapply( email_corpus, as.character ),
  type             = email_df$label,
  stringsAsFactors = FALSE
)
# All emails
# Word cloud over every email
wordcloud(
  reverse_email$text,
  max.words    = 150,
  colors       = brewer.pal( 7, "Dark2" ),
  random.order = FALSE
)
# Subsetting to spam == 1
spam <- reverse_email %>% filter( type == 1 )
# NOTE(review): wordcloud() does not document a `main` argument, so the
# titles passed below are likely ignored -- verify the rendered output.
wordcloud(
  spam$text,
  max.words    = 150,
  colors       = brewer.pal( 7, "Dark2" ),
  random.order = FALSE,
  main         = "Spam"
)
# Subsetting to spam == 0
ham <- reverse_email %>% filter( type == 0 )
wordcloud(
  ham$text,
  max.words    = 150,
  colors       = brewer.pal( 7, "Dark2" ),
  random.order = FALSE,
  main         = "Non-Spam"
)
Earlier we randomly sorted the data, so we can use the indices to split 80% training and 20% testing.
# Split 80% training / 20% testing.  Rows were shuffled earlier, so a
# positional split is a random split.  Derive the cut point from the data
# instead of hard-coding 2400/3000 (same values for the 3000-email set).
n_obs   <- nrow( email_df )
n_train <- round( 0.8 * n_obs )
email_dtm_train <- email_dtm[ seq_len( n_train ), ]
email_dtm_test  <- email_dtm[ ( n_train + 1 ):n_obs, ]
# Keep the matching labels alongside for convenience
email_train_label <- email_df[ seq_len( n_train ), ]$label
email_test_label  <- email_df[ ( n_train + 1 ):n_obs, ]$label
# Create table
# Class balance in each split.  Compute each proportion table once instead of
# re-deriving it for every cell as the original did.
train_prop <- prop.table( table( email_train_label ) )
test_prop  <- prop.table( table( email_test_label ) )
prop_table <- data.frame(
  c( train_prop[[2]], train_prop[[1]] ),  # Train: spam, non-spam
  c( test_prop[[2]],  test_prop[[1]] )    # Test:  spam, non-spam
)
# Add table headings
rownames(prop_table) <- c("Spam", "Non-Spam")
names(prop_table)    <- c("Train", "Test")
# View table
kable(prop_table, digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Train | Test | |
|---|---|---|
| Spam | 0.17 | 0.152 |
| Non-Spam | 0.83 | 0.848 |
There are currently 25,050 variables, which is most likely way too many! So, define a threshold (e.g., 1 = 1%) and reduce the number of features used.
Goal: eliminate words that appear in fewer than 10% of records in the training data
# Keep only terms appearing in at least `threshold` percent of documents.
# The original buried the assignment inside the expression as
# `( threshold = 10.0 )`; declare it explicitly instead.
threshold <- 10  # percent
min_freq  <- round( email_dtm$nrow * ( threshold / 100 ), 0 )  # 10% of 3000 = 300 docs
# NOTE(review): frequencies are computed over the full DTM (train + test);
# consider restricting to the training rows to avoid information leakage.
# Create vector of most frequent words
freq_words <- findFreqTerms( x = email_dtm,
                             lowfreq = min_freq )
# Filter both DTM splits down to the frequent terms
email_dtm_freq_train <- email_dtm_train[ , freq_words ]
email_dtm_freq_test  <- email_dtm_test[ , freq_words ]
# Create table
# Lay the frequent terms out in a 16-row x 13-column display table.
# Pad with "" so the final column fills out; a column-major matrix reshape
# reproduces the original 13 hand-written seq() slices exactly.
n_rows <- 16
n_cols <- 13
freq_words_plus_1 <- c( freq_words,
                        rep( "", n_rows * n_cols - length( freq_words ) ) )
freq_words_dt <- as.data.frame(
  matrix( freq_words_plus_1, nrow = n_rows, ncol = n_cols ),
  stringsAsFactors = FALSE
)
# View table (column names are suppressed, so the reshaped names don't matter)
kable(freq_words_dt, digits = 3, col.names = NULL) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  column_spec (1:13, border_left = T, border_right = T) %>%
  row_spec(1:16, extra_css = "border-bottom: 1px solid")
| actual | can | differ | found | instal | long | much | onli | problem | right | site | test | want |
| address | case | doe | free | interest | look | must | order | process | rpm | softwar | thank | way |
| also | chang | doesn | get | internet | lot | name | origin | product | run | someth | thing | web |
| american | check | don | give | invest | made | nation | packag | program | said | spam | think | week |
| ani | click | good | issu | need | part | provid | say | spamassassin | time | well | ||
| anoth | code | end | got | just | make | net | peopl | put | secur | sponsor | today | whi |
| anyon | com | error | govern | keep | manag | network | perl | rate | see | start | trade | will |
| back | come | even | great | know | mani | never | person | razor | seem | state | tri | window |
| base | compani | everi | group | last | market | new | phone | read | send | still | two | within |
| becaus | comput | exmh | help | life | may | next | place | real | sent | subject | type | without |
| befor | countri | file | high | like | mean | now | pleas | realli | sep | support | unit | work |
| best | current | find | home | line | messag | numbertnumber | point | receiv | server | sure | unsubscrib | world |
| better | data | first | hyperlink | link | might | offer | post | releas | servic | system | use | write |
| build | date | follow | idea | linux | million | old | power | remov | set | take | user | wrote |
| busi | day | fork | includ | list | money | onc | price | report | show | talk | veri | year |
| call | develop | form | inform | live | month | one | probabl | requir | sinc | technolog | version |
# Map term counts to categorical presence/absence: "Yes" if the term occurs
# at least once in the document, otherwise "No".  (The original assigned the
# result back to `x` and relied on the invisible value of the assignment.)
convert_values <- function(x) {
  ifelse( x > 0, "Yes", "No" )
}
# Final `train` and `test` feature matrices: convert counts to "Yes"/"No"
# column by column
email_train <- apply( email_dtm_freq_train, MARGIN = 2, FUN = convert_values )
email_test  <- apply( email_dtm_freq_test,  MARGIN = 2, FUN = convert_values )
# View data
datatable( email_train, rownames = FALSE, filter = "none",
           options = list( pageLength = 5, scrollX = TRUE ) )
# Function to create confusion matrix
## Source https://stackoverflow.com/questions/23891140/r-how-to-visualize-confusion-matrix-using-the-caret-package
# Draw confusion matric
# Draw a two-panel confusion-matrix figure with base graphics:
# panel 1 is the 2x2 matrix colored by cell count (green = correct cells,
# red = error cells), panel 2 prints the caret metrics underneath.
# `cm` is a caret::confusionMatrix object (uses cm$table, cm$byClass,
# cm$overall).  Called for its plotting side effect; returns nothing useful.
draw_confusion_matrix <- function(cm) {
total <- sum(cm$table)
# Cell counts in column-major order of cm$table
res <- as.numeric(cm$table)
# Generate color gradients. Palettes come from RColorBrewer.
greenPalette <- c("#F7FCF5","#E5F5E0","#C7E9C0","#A1D99B","#74C476","#41AB5D","#238B45","#006D2C","#00441B")
redPalette <- c("#FFF5F0","#FEE0D2","#FCBBA1","#FC9272","#FB6A4A","#EF3B2C","#CB181D","#A50F15","#67000D")
# Map a cell count to a gradient color; intensity scales with count/total,
# white for an empty cell.
getColor <- function (greenOrRed = "green", amount = 0) {
if (amount == 0)
return("#FFFFFF")
palette <- greenPalette
if (greenOrRed == "red")
palette <- redPalette
colorRampPalette(palette)(100)[10 + ceiling(90 * amount / total)]
}
# Set the basic layout: matrix panel gets 2/3 of the height, metrics 1/3
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('Confusion Matrix', cex.main=2)
# Create the matrix
# NOTE(review): `classes` is never used below
classes = colnames(cm$table)
rect(150, 430, 240, 370, col=getColor("green", res[1]))
text(195, 435, "Non-Spam", cex=1.2)
rect(250, 430, 340, 370, col=getColor("red", res[3]))
text(295, 435, "Spam", cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col=getColor("red", res[2]))
rect(250, 305, 340, 365, col=getColor("green", res[4]))
text(140, 400, "Non-Spam", cex=1.2, srt=90)
text(140, 335, "Spam", cex=1.2, srt=90)
# Add in the cm results (counts printed in the center of each cell)
text(195, 400, res[1], cex=1.6, font=2, col='black')
text(195, 335, res[2], cex=1.6, font=2, col='black')
text(295, 400, res[3], cex=1.6, font=2, col='black')
text(295, 335, res[4], cex=1.6, font=2, col='black')
# Add in the specifics: byClass slots 1,2,5,6,7 are Sensitivity, Specificity,
# Precision, Recall and F1 in caret's confusionMatrix
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "Metrics", xaxt='n', yaxt='n')
text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
# Add in accuracy information (overall slots 1,2 = Accuracy, Kappa)
text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
} # Create model from the training dataset
# Fit a naive Bayes spam classifier on the prepared training set
bayes_classifier <- e1071::naiveBayes( email_train, email_train_label )
# Score the held-out test set
email_test_pred <- predict( bayes_classifier, email_test )
# Display results
naive_bayes_results <- confusionMatrix(
  data      = email_test_pred,
  reference = email_test_label,
  positive  = "1",
  dnn       = c("Prediction", "Actual")
)
draw_confusion_matrix(naive_bayes_results)
# Splitting for 5-fold cross-validation
# 5-fold CV resamples of the training data
# NOTE(review): email_train is a character matrix produced by apply() above;
# vfold_cv()/recipe() expect a data frame, and the formula references
# email_train_label and V1, which are not columns of that matrix -- confirm
# this chunk ran against a data-frame version of the training data.
folds <- email_train %>% vfold_cv(v = 5)
# Defining Lambdas (from Lecture 005): 0 plus 100 log-spaced penalties
lambdas <- data.frame( penalty = c( 0, 10^seq( from = 5, to = -2, length = 100 ) ) )
# Defining the recipe: outcome ~ all predictors, V1 demoted to an id column,
# nominal predictors expanded to dummies
data_recipe <- recipe(
email_train_label ~ .,
data = email_train
) %>%
update_role(V1, new_role = 'id variable') %>%
step_dummy(all_nominal(), - all_outcomes())
# Defining Lasso Model (mixture = 1 is pure lasso; penalty tuned by CV)
lasso <- linear_reg(
penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")
# Setting up workflow
workflow_lasso <- workflow() %>%
add_recipe( data_recipe ) %>%
add_model( lasso )
# Parallelize
# NOTE(review): the parallel backend is never unregistered afterwards
doParallel::registerDoParallel(cores = 4)
# CV: evaluate every lambda on every fold, scoring RMSE and MAE
lasso_cv <- workflow_lasso %>%
tune_grid(
resamples = folds,
grid = lambdas,
metrics = metric_set(rmse, mae)
)
# Find best models
## Source: juliasilge.com/blog/lasso-the-office/
# Plot CV error (rmse & mae) with +/- one standard error against the penalty
lasso_cv %>% collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_errorbar(aes(
    ymin = mean - std_err,
    ymax = mean + std_err
  ),
  alpha = 0.5
  ) +
  geom_line(size = 1.5) +
  facet_wrap(~.metric, scales = "free", nrow = 2) +
  # FIX: theme_base() is a complete theme and replaces all earlier theme()
  # settings, so it must come BEFORE the legend tweak -- the original order
  # silently re-enabled the legend.
  theme_base() +
  theme(legend.position = "none") +
  scale_x_log10()
# Get best penalties
# For each metric, keep the best-performing penalty (smallest among ties)
best_lasso_mae <- lasso_cv %>%
  show_best(metric = "mae") %>%
  filter(penalty == min(penalty))
best_lasso_rmse <- lasso_cv %>%
  show_best(metric = "rmse") %>%
  filter(penalty == min(penalty))
best_lasso <- rbind(best_lasso_mae, best_lasso_rmse)
# View in table
names(best_lasso) <- c("Penalty", "Metric", "Estimator", "Mean", "n", "Standard Error", ".config")
kable(best_lasso, digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Penalty | Metric | Estimator | Mean | n | Standard Error | .config |
|---|---|---|---|---|---|---|
| 0.031 | mae | standard | 0.283 | 5 | 0.002 | Preprocessor1_Model009 |
| 0.023 | rmse | standard | 0.376 | 5 | 0.004 | Preprocessor1_Model007 |
# Get test metrics
## Define best model - according to MAE: lock the winning penalty into the
## workflow and refit on the full training set
best_lasso_workflow = workflow_lasso %>%
finalize_workflow(select_best(lasso_cv, metric = "mae")) %>%
fit(data = email_train)
# Extract the underlying glmnet fit for predict()
best_lasso = best_lasso_workflow %>% extract_fit_parsnip()
## Clean test data: apply the same preprocessing (id role, dummies) to the
## test set so its columns match the training design matrix
## NOTE(review): prep()/juice() here re-estimates the recipe on the test
## data rather than baking the training recipe -- confirm this is intended
email_test_clean = recipe(
email_test_label ~ .,
data = email_test
) %>%
update_role(V1, new_role = 'id variable') %>%
step_dummy(all_nominal(), - all_outcomes()) %>%
prep() %>% juice()
## Make predictions: threshold the continuous lasso output at 0.5
lasso_predictions <- predict(best_lasso, email_test_clean)
lasso_predictions <- lasso_predictions %>% mutate(prediction = ifelse(.pred < 0.5, 0, 1))
email_test_clean$predictions <- lasso_predictions$prediction
## Calculate accuracy
email_test_clean <- email_test_clean %>% mutate(accurate = ifelse(predictions == email_test_label, 1, 0))
acc <- sum(email_test_clean$accurate) / nrow(email_test_clean)
## Confusion counts derived from the actual predictions.  The original
## hard-coded tp = 0 and fp = 0 on the assumption that the model always
## predicts non-spam; computing them keeps the metrics honest if that
## assumption ever stops holding (identical results when it does hold).
tp <- email_test_clean %>% filter(predictions == 1, email_test_label == 1) %>% nrow()
fn <- email_test_clean %>% filter(predictions == 0, email_test_label == 1) %>% nrow()
fp <- email_test_clean %>% filter(predictions == 1, email_test_label == 0) %>% nrow()
tn <- email_test_clean %>% filter(predictions == 0, email_test_label == 0) %>% nrow()
## Sensitivity = TP / (TP + FN); specificity = TN / (TN + FP)
sens <- tp / (tp + fn)
spec <- tn / (tn + fp)
## Make table
lasso_table <- tibble(
  metric = c("accuracy", "sensitivity", "specificity"),
  value = c(acc, sens, spec)
)
lasso_table %>% kable(digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| metric | value |
|---|---|
| accuracy | 0.848 |
| sensitivity | 0.000 |
| specificity | 1.000 |
# Set seed
set.seed(9753)
# Converting outcome variable into character vector ("Yes"/"No") so the
# classification engine treats it as a two-level nominal outcome
# NOTE(review): email_train is a matrix at this point in the visible code;
# mutate() requires a data frame -- confirm the object's type here.
email_train <- email_train %>%
mutate(
outcome_as_vector = ifelse(email_train_label == 1, "Yes", "No")
)
# Split for 5-fold cross-validation
folds <- vfold_cv(email_train, v = 5)
# Defining the recipe: drop the numeric label (replaced by the Yes/No
# outcome), demote V1 to an id, dummy-encode, drop zero-variance columns,
# and normalize predictors for glmnet
data_recipe <- recipe(
outcome_as_vector ~ .,
data = email_train
) %>%
step_rm(email_train_label) %>%
update_role(V1, new_role = 'id variable') %>%
step_dummy(all_nominal(), - all_outcomes()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
# Defining Lambdas (from Lecture 005)
# NOTE(review): `lambdas` is redefined here but tune_grid below uses a
# latin-hypercube grid instead -- this object goes unused in this chunk
lambdas <- data.frame( penalty = c( 0, 10^seq( from = 5, to = -2, length = 100 ) ) )
# Defining Lasso Model (logistic regression with L1 penalty, tuned by CV)
log_lasso <- logistic_reg(
mode = 'classification',
penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")
# Setting up workflow
workflow_lasso <- workflow() %>%
add_recipe( data_recipe ) %>%
add_model( log_lasso )
# CV: 5 latin-hypercube penalty values, scored on five classification metrics
log_lasso_cv <- workflow_lasso %>%
tune_grid(
resamples = folds,
metrics = metric_set(yardstick::accuracy,
yardstick::roc_auc,
yardstick::sens,
yardstick::spec,
yardstick::precision),
grid = grid_latin_hypercube(penalty(), size = 5),
control = control_grid(parallel_over = 'resamples')
)
# Find test metrics
# Average each metric over penalties/folds, then relabel rows for display.
# collect_metrics() groups come back alphabetically:
# accuracy, precision, roc_auc, sens, spec -- the labels below match that order.
log_lasso_cv_results <- log_lasso_cv %>%
  collect_metrics() %>%
  group_by(.metric) %>%
  summarize(mean_accuracy = mean(mean, na.rm = TRUE))
log_lasso_cv_results[1] <- c("Accuracy", "Precision", "Area Under the Curve", "Sensitivity", "Specificity")
names(log_lasso_cv_results) <- c("Metric", "Mean")
## View table
kable(log_lasso_cv_results, digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Metric | Mean |
|---|---|
| Accuracy | 0.809 |
| Precision | 0.831 |
| Area Under the Curve | 0.508 |
| Sensitivity | 0.967 |
| Specificity | 0.040 |
# Set seed
set.seed(9753)
# Split for 5-fold cross-validation (same folds recipe as the lasso chunk)
folds <- vfold_cv(email_train, v = 5)
# Defining the recipe: identical preprocessing to the logistic-lasso chunk
# so the two models are directly comparable
data_recipe <- recipe(
outcome_as_vector ~ .,
data = email_train
) %>%
step_rm(email_train_label) %>%
update_role(V1, new_role = 'id variable') %>%
step_dummy(all_nominal(), - all_outcomes()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
# Define the model - simple LogReg because no penalty
model_logistic <- logistic_reg(
mode = 'classification') %>%
set_engine('glm')
# Define the workflow
workflow_logistic <- workflow() %>%
add_recipe(data_recipe) %>%
add_model(model_logistic)
# CV: no tuning parameters, so fit_resamples (not tune_grid) is used
cv_logistic <- workflow_logistic %>%
fit_resamples(
resamples = folds,
metrics = metric_set(yardstick::accuracy,
yardstick::roc_auc,
yardstick::sens,
yardstick::spec,
yardstick::precision)
)
# Get test metrics
# Collect the CV metrics and relabel rows for display
# (metrics come back alphabetically: accuracy, precision, roc_auc, sens, spec)
log_reg_results <- cv_logistic %>%
  collect_metrics() %>%
  select(.metric, mean)
log_reg_results[1] <- c("Accuracy", "Precision", "Area Under the Curve", "Sensitivity", "Specificity")
names(log_reg_results) <- c("Metric", "Mean")
# View in table
kable(log_reg_results, digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Metric | Mean |
|---|---|
| Accuracy | 0.797 |
| Precision | 0.832 |
| Area Under the Curve | 0.511 |
| Sensitivity | 0.947 |
| Specificity | 0.071 |
# Train the model - using 200 trees
# caret::train with the ranger engine, 3-fold CV over the default tuning
# grid; impurity importance is stored for the variable-importance plot below.
# NOTE(review): email_train_rf / email_train_label_rf are not defined
# anywhere in this file -- presumably data-frame versions of email_train and
# the labels; confirm where they are created.
random_forest <- train(
x = email_train_rf,
y = email_train_label_rf,
method = "ranger",
num.trees = 200,
importance = "impurity",
trControl = trainControl(method = "cv",
number = 3,
verboseIter = TRUE
)
)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2, splitrule = gini, min.node.size = 1 on full training set
# Check variable importances: pull the 25 most important (stemmed) words.
# slice_max() replaces the superseded top_n()/arrange(-Overall) pair and
# selects explicitly by the Overall column.
top_25words <- varImp(random_forest, scale = TRUE)$importance %>%
  rownames_to_column() %>%
  slice_max(Overall, n = 25)
# Plot variable importance, most important word first
ggplot(data = top_25words,
       aes(x = reorder(rowname, Overall),
           y = Overall)) +
  geom_bar(stat = "identity") +
  theme_base() +
  theme(axis.text.x = element_text(angle = 50, hjust = 1)) +
  xlab("Top 25 Predictive Words (stemmed)") +
  ylab("Frequency of Word") +
  labs(title = "Most Predictive Words") +
  theme(plot.title = element_text(hjust = 0.5))
# Re-declaring test data
# Rebuild the test feature matrix ("Yes"/"No" per term)
email_test <- apply( email_dtm_freq_test, MARGIN = 2,
                     convert_values )
# Predict on test data.
# FIX: the original cbind()-ed email_test_label onto the predictors before
# predicting, leaking the outcome into the feature matrix; predict on the
# features alone.
predictions <- predict(random_forest, email_test)
# View test metrics in confusion matrix
random_forest_results <- confusionMatrix( data = predictions,
                                          reference = email_test_label,
                                          positive = "1",
                                          dnn = c("Prediction", "Actual") )
draw_confusion_matrix(random_forest_results)
# Creating df with metrics of all models
# Collect accuracy / sensitivity / specificity / precision for every model
# into one comparison table (one row per model, one column per metric)
comparing_acc_table <- data.frame(
  c(
    "Naive Bayes",
    "Lasso",
    "Logistic Lasso",
    "Logistic",
    "Random Forest"
  ),
  c(
    naive_bayes_results[["overall"]][["Accuracy"]],
    acc,
    log_lasso_cv_results$Mean[1],
    log_reg_results$Mean[1],
    random_forest_results[["overall"]][["Accuracy"]]
  ),
  c(
    naive_bayes_results[["byClass"]][["Sensitivity"]],
    sens,
    log_lasso_cv_results$Mean[4],
    log_reg_results$Mean[4],
    random_forest_results[["byClass"]][["Sensitivity"]]
  ),
  c(
    naive_bayes_results[["byClass"]][["Specificity"]],
    spec,
    log_lasso_cv_results$Mean[5],
    log_reg_results$Mean[5],
    random_forest_results[["byClass"]][["Specificity"]]
  ),
  c(
    naive_bayes_results[["byClass"]][["Precision"]],
    # FIX: lasso precision was hard-coded to 0 (with a broken ifelse left in
    # a comment); guard the 0/0 no-predicted-positives case explicitly
    ifelse( tp + fp == 0, 0, tp / ( tp + fp ) ),
    log_lasso_cv_results$Mean[2],
    log_reg_results$Mean[2],
    random_forest_results[["byClass"]][["Precision"]]
  )
)
names(comparing_acc_table) <- c("Method", "Accuracy", "Sensitivity", "Specificity", "Precision")
# View table
kable(comparing_acc_table, digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Method | Accuracy | Sensitivity | Specificity | Precision |
|---|---|---|---|---|
| Naive Bayes | 0.788 | 0.044 | 0.921 | 0.091 |
| Lasso | 0.848 | 0.000 | 1.000 | 0.000 |
| Logistic Lasso | 0.809 | 0.967 | 0.040 | 0.831 |
| Logistic | 0.797 | 0.947 | 0.071 | 0.832 |
| Random Forest | 0.847 | 0.000 | 0.998 | 0.000 |